Decomposition of potato yield gap in the Andean north of Peru

A crop modeling approach

Background

This repository is designed to provide comprehensive documentation of the R code used for the analysis of yield gap decomposition, integrating both crop modeling and stochastic frontier analysis.

Libraries and extra R-files

The following libraries were used:

# libraries
library(nasapower)
library(meteor)
library(lubridate)
library(Metrics)
library(dplyr)
library(tidyr)
library(minpack.lm)
library(ggplot2)

Additionally, extra R-files need to be loaded.

# extra R-files
load(url("https://github.com/jninanya/solanumR/raw/refs/heads/main/CropParamsList.Rdata"))
source("https://raw.githubusercontent.com/jninanya/solanumR/refs/heads/main/thermalTime.R")
source("https://raw.githubusercontent.com/jninanya/solanumR/refs/heads/main/SolanumModel.R")

Field experiment and data collection

A field experiment was conducted to determine the potential yield of the five most commonly grown potato varieties in Chugay, La Libertad, Peru: Amarillis, Bretaña, Huevo de Indio, Poderosa, and Yungay. These varieties were managed under optimal growing conditions to assess their yield potential. The trial took place from 24th October 2023 to 18th April 2024, providing insights into how these varieties perform in a local context under “ideal” conditions.

Figura 1: Field experiment to determine potential yield of the most common potato varieties in Chugay Distric, La-Libertad region, Peru.
Figura 1: Field experiment to determine potential yield of the most common potato varieties in Chugay Distric, La-Libertad region, Peru.

A total of five biomass evaluations were carried out, where the plant organs (leaf, stem, root, and tubers) were separated, every two weeks from the tuberization stage to harvest. Additionally, fifteen canopy cover evaluations were conducted from plant emergence to senescence, providing detailed data on the growth and development of the five potato varieties. The dataset related to this field experiment can be downloaded at https://doi.org/10.21223/JPA3NZ.

Biomass data

Lets see the harvest index, i.e., the ratio of tuber biomass over total biomass. Click on code to see the R chunk code for the canopy biomass data.

#---------------------------------------------------------------
# SUMMARY OF BIOMASS DATA
#---------------------------------------------------------------
BD <- read.csv("https://github.com/jninanya/Potato_Yield_Gap/raw/refs/heads/main/Data/biomass_dataset.csv")
#head(BD)

# Constant to convert kg/plant to t/ha
k = (1/1000)*(1/(0.3*1))*(10000/1)*0.001

# Matter concentration of <organ> (MCX)
BD$MCL <- BD$sDLM/BD$sFLM      # L = leaf
BD$MCS <- BD$sDSM/BD$sFSM      # S = stem
BD$MCR <- BD$sDRM/BD$sFRM      # R = root
BD$MCT <- BD$sDTM/BD$sFTM      # T = tuber
BD$DMC <- BD$MCT                 

# Dry <organ> matter (DXM)
BD$DLM <- BD$FLM*BD$MCL
BD$DSM <- BD$FSM*BD$MCS
BD$DRM <- BD$FRM*BD$MCR
BD$DTM <- BD$FTM*BD$MCT

# Fresh <organ> yield (FXY)
BD$FLY <- BD$FLM*k
BD$FSY <- BD$FSM*k
BD$FRY <- BD$FRM*k
BD$FTY <- BD$FTM*k               

# Dry <organ> yield (DXY)
BD$DLY <- BD$DLM*k
BD$DSY <- BD$DSM*k
BD$DRY <- BD$DRM*k
BD$DTY <- BD$DTM*k

# Total dry matter (TDM) and harvest index (HI)
BD$TDM <- BD$DLY + BD$DSY + BD$DRY + BD$DTY
BD$HI <- BD$DTY/BD$TDM

# Summary: mean and standard error for HI
smrBD <- BD %>%
  group_by(CODE, EVAL, DAP) %>%
  summarise(DLY = mean(DLY, na.rm = TRUE),
            DSY = mean(DSY, na.rm = TRUE),
            DRY = mean(DRY, na.rm = TRUE),
            DTY = mean(DTY, na.rm = TRUE),
            FTY = mean(FTY, na.rm = TRUE),
            TDM = mean(TDM, na.rm = TRUE),
            DMC = mean(DMC, na.rm = TRUE),
            N = sum(!is.na(HI)),
            nd = n(),
            HI_mean = mean(HI, na.rm = TRUE),
            HI_se = sd(HI, na.rm = TRUE)/sqrt(N)
    )
smrBD <- as.data.frame(smrBD)

outBD <- smrBD[, c("CODE","DAP","HI_mean")] %>%
  pivot_wider(names_from = CODE, values_from = HI_mean)
outBD

outTDM <- smrBD[, c("CODE","DAP","TDM")] %>%
  pivot_wider(names_from = CODE, values_from = TDM)
outTDM <- as.data.frame(outTDM)

outDMC <- smrBD[, c("CODE","DAP","DMC")] %>%
  pivot_wider(names_from = CODE, values_from = DMC)
outDMC <- as.data.frame(outDMC)

outFTY <- smrBD[, c("CODE","DAP","FTY")] %>%
  pivot_wider(names_from = CODE, values_from = FTY)
outFTY <- as.data.frame(outFTY)

# Data frame for each variety
AMA <- smrBD[smrBD$CODE=="AMA",]
BRE <- smrBD[smrBD$CODE=="BRE",]
HUE <- smrBD[smrBD$CODE=="HUE",]
POD <- smrBD[smrBD$CODE=="POD",]
YUN <- smrBD[smrBD$CODE=="YUN",]

# General plot settings
par(oma    = c(4.5, 4.5, 0.5, 3),  # general margins
    mfrow  = c(1, 5),                # number of sub-figures
    mar    = c(0, 0, 0, 0),          # margins per sub-figure
    family = "serif",                # text family
    lwd    = 1.0,                    # line width
    las    = 1,                      # style of axis labels  
    pch    = 19,                     # plotting points
    cex    = 0.8
)

# Color and y-axis limits
pC <- c("blue","yellow","green","cyan","red") 
yL <- c(0,1) 

# Plot for Amarilis
with(AMA, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[1], axes=FALSE, xlab="", ylab=""))
with(AMA, lines(x=DAP, y=HI_mean, lty=2, col=pC[1]))
box(); axis(2); axis(4, labels=FALSE);axis(1)
mtext(side=2, "Harvest Index", line=3, las=0)

# Plot for Bretana
with(BRE, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[2], axes=FALSE, xlab="", ylab=""))
with(BRE, lines(x=DAP, y=HI_mean, lty=2, col=pC[2]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)

# Plot for Huevo de Indio
with(HUE, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[3], axes=FALSE, xlab="", ylab=""))
with(HUE, lines(x=DAP, y=HI_mean, lty=2, col=pC[3]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)
mtext(side=1, "Days after planting", line=3)

# Plot for Poderosa
with(POD, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[4], axes=FALSE, xlab="", ylab=""))
with(POD, lines(x=DAP, y=HI_mean, lty=2, col=pC[4]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)

# Plot for Yungay
with(YUN, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[5], axes=FALSE, xlab="", ylab=""))
with(YUN, lines(x=DAP, y=HI_mean, lty=2, col=pC[5]))
box(); axis(2,labels=FALSE); axis(4);axis(1)

## # A tibble: 6 × 6
##     DAP    AMA    BRE    HUE    POD    YUN
##   <int>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1    94  0.596  0.208  0.277  0.148  0.245
## 2   101  0.668 NA     NA     NA     NA    
## 3   113  0.727  0.376  0.477  0.405  0.434
## 4   136  0.834  0.599  0.706  0.677  0.674
## 5   154  0.872  0.700  0.803  0.718  0.766
## 6   178 NA      0.729  0.854  0.790  0.856

Canopy cover data

Click on code to see the R chunk code for the canopy cover data.

#---------------------------------------------------------------
# SUMMARY OF CANOPY COVER DATA
#---------------------------------------------------------------
CD <- read.csv("https://github.com/jninanya/Potato_Yield_Gap/raw/refs/heads/main/Data/canopy_cover_dataset.csv")
#head(CD)

# Summary by variety and evaluation
smrCD <- CD %>%
  group_by(CODE, DAP) %>%
  summarise(N = sum(!is.na(CC)),
            nd = n(),
            CC_mean = mean(CC, na.rm = TRUE),
            CC_se = sd(CC, na.rm = TRUE)/sqrt(N),
  )

smrCD <- as.data.frame(smrCD)

# Data frame per variety
outCD <- smrCD[, c("CODE","DAP","CC_mean")] %>%
  pivot_wider(names_from = CODE, values_from = CC_mean)
outCD

# Data frame for each variety
AMA <- smrCD[smrCD$CODE=="AMA",]
BRE <- smrCD[smrCD$CODE=="BRE",]
HUE <- smrCD[smrCD$CODE=="HUE",]
POD <- smrCD[smrCD$CODE=="POD",]
YUN <- smrCD[smrCD$CODE=="YUN",]

# General plot settings
par(oma    = c(4.5, 4.5, 0.5, 3),  # general margins
    mfrow  = c(1, 5),                # number of sub-figures
    mar    = c(0, 0, 0, 0),          # margins per sub-figure
    family = "serif",                # text family
    lwd    = 1.0,                    # line width
    las    = 1,                      # style of axis labels  
    pch    = 19,                     # plotting points
    cex    = 0.8
)

# Color and y-axis limits
pC <- c("blue","yellow","green","cyan","red") 
yL <- c(0,100) 

# Plot for Amarilis
with(AMA, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[1], axes=FALSE, xlab="", ylab=""))
with(AMA, lines(x=DAP, y=CC_mean, lty=2, col=pC[1]))
box(); axis(2); axis(4, labels=FALSE);axis(1)
mtext(side=2, "Canopy cover (%)", line=3, las=0)

# Plot for Bretana
with(BRE, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[2], axes=FALSE, xlab="", ylab=""))
with(BRE, lines(x=DAP, y=CC_mean, lty=2, col=pC[2]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)

# Plot for Huevo de Indio
with(HUE, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[3], axes=FALSE, xlab="", ylab=""))
with(HUE, lines(x=DAP, y=CC_mean, lty=2, col=pC[3]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)
mtext(side=1, "Days after planting", line=3)

# Plot for Poderosa
with(POD, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[4], axes=FALSE, xlab="", ylab=""))
with(POD, lines(x=DAP, y=CC_mean, lty=2, col=pC[4]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)

# Plot for Yungay
with(YUN, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[5], axes=FALSE, xlab="", ylab=""))
with(YUN, lines(x=DAP, y=CC_mean, lty=2, col=pC[5]))
box(); axis(2,labels=FALSE); axis(4);axis(1)

## # A tibble: 15 × 6
##      DAP    AMA   BRE   HUE   POD   YUN
##    <int>  <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    52   9.72  1.61  2.61  7.66   0  
##  2    59  35.8  25.7  18.3  27.0   22.4
##  3    65  45.0  36.6  26.2  41.9   32.8
##  4    67  51.5  42.8  31.4  53.3   38.1
##  5    70  65.4  47.7  39.3  70.3   46.3
##  6    76  78.1  60.1  50.0  86.1   57.7
##  7    80  84.7  67.2  52.9  91.4   66.7
##  8    86  83.7  68.7  50.6  92.9   63.2
##  9    98  88.7  83.9  65.7  95.3   76.8
## 10   110  94.4  93.6  77.6  96.2   82.1
## 11   121  86.7  89.2  79.8  96.0   85.3
## 12   129  78.5  87.6  72.6  88.5   85.1
## 13   149  41.8  85.5  64.4  45.8   65.7
## 14   156 NaN    70.9  44.1  19.5   36.4
## 15   165 NaN    58.5  24.8  11.4   21.8

Biass correction of weather data

A bias correction of NASAPower weather data was performed using information from an in situ weather station. The correction methods applied included linear regression, quantile matching, and statistical distribution with spline cubics. Although the in situ weather station provided data for only one year, a 10-year weather dataset was generated using the bias-corrected NASAPower data (to avoid seasonality error), ensuring it accurately reflects local conditions during the experiment.

A simple bias correction was chosen, and the minimum temperature, maximum temperature, and solar radiation were corrected. If you would like to see the code chunk, click on code.

#---------------------------------------------------------------
# RETRIEVE DATA FROM NASA POWER
#---------------------------------------------------------------
# Define variables (wvars) and period
wvars <- c("T2M_MAX", "T2M_MIN", "ALLSKY_SFC_SW_DWN")
period <- c("2000-01-01", "2024-05-31")

# Coordinates (lon, lat)
PRO <- c(-77.82, -7.75) 

# Get daily data from NASA POWER 
wdata <- get_power(
  community = "ag",
  lonlat = PRO,
  pars = wvars,
  dates = period,
  temporal_api = "daily"
)

wd <- data.frame(date = wdata$YYYYMMDD, tmin = wdata$T2M_MIN,
                 tmax = wdata$T2M_MAX, srad = wdata$ALLSKY_SFC_SW_DWN)

#---------------------------------------------------------------
# CORRECT DATA FROM NASA POWER
#---------------------------------------------------------------
# Local weather data
df <- read.csv("https://github.com/jninanya/Potato_Yield_Gap/raw/refs/heads/main/Data/dataset_for_model_calibration.csv")
colnames(df) <- tolower(colnames(df))

# Growing season
swgDate <- as.Date("2023-10-24")
hvtDate <- as.Date("2024-04-18")
GS <- seq(swgDate, hvtDate, by = 1)

wdS <- wd[wd$date >= swgDate & wd$date <= hvtDate, ]
wdO <- df
#sum(wdS$date == wdO$date) == length(wdO$date)

# Create samples for training and validation
set.seed(333)
nT <- 1:nrow(wdO)               # n total
nS <- round(0.70*nrow(wdO), 0)  # n sample
Tperiod <- sample(nT, nS)
Vperiod <- nT[-Tperiod]

# Performance metrics
nM <- 5
M1 <- matrix(nrow = nM, ncol = 3) # RRMSE
M2 <- matrix(nrow = nM, ncol = 3) # RMSE
M3 <- matrix(nrow = nM, ncol = 3) # R2
M4 <- matrix(nrow = nM, ncol = 3) # m (slope)

outBC <- list()  # Bias Correction
outLR <- list()  # Linear Regression
outQM <- list()  # Quantile Matching
outSD <- list()  # Statistical Distribution

for(i in 1:3){
  
  obsT = wdO[Tperiod, 1+i]
  simT = wdS[Tperiod, 1+i]
  
  obsV = wdO[Vperiod, 1+i]
  simV = wdS[Vperiod, 1+i]
  
  # Without any correction
  actual = obsV
  predicted = simV
  REG = summary(lm(actual ~ predicted + 0))
  M1[1, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[1, i] = round(rmse(actual, predicted), 2)
  M3[1, i] = round(REG$adj.r.squared, 2)
  M4[1, i] = round(REG$coefficients[1,1], 2)
  
  # Bias correction
  bias = mean(obsT - simT)
  obsC = simV + bias
  actual = obsV
  predicted = obsC
  REG = summary(lm(actual ~ predicted + 0))
  M1[2, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[2, i] = round(rmse(actual, predicted), 2)
  M3[2, i] = round(REG$adj.r.squared, 2)
  M4[2, i] = round(REG$coefficients[1,1], 2)
  
  # Linear regression
  LR = lm(obsT ~ simT)
  obsC = predict(LR, newdata = data.frame(simT = simV))
  actual = obsV
  predicted = obsC
  REG = summary(lm(actual ~ predicted + 0))
  M1[3, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[3, i] = round(rmse(actual, predicted), 2)
  M3[3, i] = round(REG$adj.r.squared, 2)
  M4[3, i] = round(REG$coefficients[1,1], 2)
  
  # Quantile matching
  common_quantiles = seq(0, 1, length.out = 100)
  sCDF = quantile(simT, probs = common_quantiles)
  oCDF = quantile(obsT, probs = common_quantiles)
  QM = lm(oCDF ~ sCDF)
  obsC = simV*coef(QM)[2] + coef(QM)[1]
  actual = obsV
  predicted = obsC
  REG = summary(lm(actual ~ predicted + 0))
  M1[4, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[4, i] = round(rmse(actual, predicted), 2)
  M3[4, i] = round(REG$adj.r.squared, 2)
  M4[4, i] = round(REG$coefficients[1,1], 2)
  
  # Statistical distribution with sline cubics 
  sort_obs = sort(obsT)
  sort_sim = sort(simT)
  SD = smooth.spline(sort_sim, sort_obs)
  obsC = predict(SD, simV)$y
  actual = obsV
  predicted = obsC
  REG = summary(lm(actual ~ predicted + 0))
  M1[5, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[5, i] = round(rmse(actual, predicted), 2)
  M3[5, i] = round(REG$adj.r.squared, 2)
  M4[5, i] = round(REG$coefficients[1,1], 2)
  
  outBC[[i]] = bias 
  outLR[[i]] = LR 
  outQM[[i]] = QM
  outSD[[i]] = SD
  
}

colnames(M1) <- colnames(M2) <- c("tmin", "tmax", "srad")
colnames(M4) <- colnames(M3) <- c("tmin", "tmax", "srad")

rownames(M1) <- rownames(M2) <- c("NC", "BC", "LR", "QM", "SD")
rownames(M3) <- rownames(M4) <- c("NC", "BC", "LR", "QM", "SD")

names(outBC) <- names(outLR) <- c("tmin", "tmax", "srad")
names(outQM) <- names(outSD) <- c("tmin", "tmax", "srad")


#---------------------------------------------------------------
# PLOTS
#---------------------------------------------------------------
# General plot settings
par(oma    = c(4.5, 4.5, 0.5, 3),  # general margins
    mfrow  = c(4, 3),                # number of sub-figures
    mar    = c(4.5,2.5,1.5,1.5),          # margins per sub-figure
    family = "serif",                # text family
    lwd    = 1.0,                    # line width
    las    = 1,                      # style of axis labels  
    pch    = 19,                     # plotting points
    cex    = 0.5,
    pty = "s"
)

# PLOTS FOR SIMPLE BIAS CORRECTION
tminC = wdS$tmin + outBC$tmin
plot(wdS$tmin, wdO$tmin, xlim = c(0,14), ylim = c(0,14), xlab = "NASA: tmin (C)", ylab = "OBS: tmin (C)", col = "blue")
points(tminC, wdO$tmin, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

tmaxC = wdS$tmax + outBC$tmax
plot(wdS$tmax, wdO$tmax, xlim = c(8,28), ylim = c(8,28), xlab = "NASA: tmax (C)", ylab = "OBS: tmax (C)", col = "blue")
points(tmaxC, wdO$tmax, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)
title("Biass Correction")

sradC = wdS$srad + outBC$srad
plot(wdS$srad, wdO$srad, xlim = c(3,31), ylim = c(3,31), xlab = "NASA: srad (MJ/m2/day)", ylab = "OBS: srad (MJ/m2/day)", col = "blue")
points(sradC, wdO$srad, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)


# PLOTS FOR LINEAR REGRESSION
tminC = predict(outLR$tmin, newdata = data.frame(simT = wdS$tmin))
plot(wdS$tmin, wdO$tmin, xlim = c(0,14), ylim = c(0,14), xlab = "NASA: tmin (C)", ylab = "OBS: tmin (C)", col = "blue")
points(tminC, wdO$tmin, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

tmaxC = predict(outLR$tmax, newdata = data.frame(simT = wdS$tmax))
plot(wdS$tmax, wdO$tmax, xlim = c(8,28), ylim = c(8,28), xlab = "NASA: tmax (C)", ylab = "OBS: tmax (C)", col = "blue")
points(tmaxC, wdO$tmax, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)
title("Linear Regression")

sradC = predict(outLR$srad, newdata = data.frame(simT = wdS$srad))
plot(wdS$srad, wdO$srad, xlim = c(3,31), ylim = c(3,31), xlab = "NASA: srad (MJ/m2/day)", ylab = "OBS: srad (MJ/m2/day)", col = "blue")
points(sradC, wdO$srad, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)


# PLOTS FOR QUANTILE MATCHING
tminC = predict(outQM$tmin, newdata = data.frame(sCDF = wdS$tmin))
plot(wdS$tmin, wdO$tmin, xlim = c(0,14), ylim = c(0,14), xlab = "NASA: tmin (C)", ylab = "OBS: tmin (C)", col = "blue")
points(tminC, wdO$tmin, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

tmaxC = predict(outQM$tmax, newdata = data.frame(sCDF = wdS$tmax))
plot(wdS$tmax, wdO$tmax, xlim = c(8,28), ylim = c(8,28), xlab = "NASA: tmax (C)", ylab = "OBS: tmax (C)", col = "blue")
points(tmaxC, wdO$tmax, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)
title("Quantile Matching")

sradC = predict(outQM$srad, newdata = data.frame(sCDF = wdS$srad))
plot(wdS$srad, wdO$srad, xlim = c(3,31), ylim = c(3,31), xlab = "NASA: srad (MJ/m2/day)", ylab = "OBS: srad (MJ/m2/day)", col = "blue")
points(sradC, wdO$srad, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)


# PLOT FOR STATISTICAL DISTRIBUTION USING SPLINES
tminC = predict(outSD$tmin, wdS$tmin)$y
plot(wdS$tmin, wdO$tmin, xlim = c(0,14), ylim = c(0,14), xlab = "NASA: tmin (C)", ylab = "OBS: tmin (C)", col = "blue")
points(tminC, wdO$tmin, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

tmaxC = predict(outSD$tmax, wdS$tmax)$y
plot(wdS$tmax, wdO$tmax, xlim = c(8,28), ylim = c(8,28), xlab = "NASA: tmax (C)", ylab = "OBS: tmax (C)", col = "blue")
points(tmaxC, wdO$tmax, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)
title("Statistical Distribution")

sradC = predict(outSD$srad, wdS$srad)$y
plot(wdS$srad, wdO$srad, xlim = c(3,31), ylim = c(3,31), xlab = "NASA: srad (MJ/m2/day)", ylab = "OBS: srad (MJ/m2/day)", col = "blue")
points(sradC, wdO$srad, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

#---------------------------------------------------------------
# FINAL WEATHER DATA FOR MODELING
#---------------------------------------------------------------
wo <- wd
wc <- data.frame(date = wd$date)

# Apply the quantile matching to dataset
wc$tmin <- predict(outQM$tmin, newdata = data.frame(sCDF = wo$tmin))
wc$tmax <- predict(outQM$tmax, newdata = data.frame(sCDF = wo$tmax))
wc$srad <- predict(outQM$srad, newdata = data.frame(sCDF = wo$srad))

# Final weather data
WD <- wc

SOLANUM model calibration

The SOLANUM model was calibrated using temporal data on canopy cover and biomass. Canopy cover data was fitted to a beta function, while biomass data was fitted to a Gompertz function. Nonlinear regression was applied for this calibration.

Click on code to view how the canopy cover data was fitted to the Beta curve using the FitCurveSM function in R.

# load thermal time function
source("../R/thermalTime.R")
source("../R/SolanumModel.R")
source("../R/FitCurveSM.R")

# Emergency day
ee <- data.frame(EDay=c(16, 33, 34, 24, 42),
                 CODE=c("AMA", "BRE", "HUE", "POD", "YUN"))
EDay <- ee$EDay
names(EDay) <- c("AMA", "BRE", "HUE", "POD", "YUN")

# Read weather data
wd <- read.csv("https://github.com/jninanya/Potato_Yield_Gap/raw/refs/heads/main/Data/dataset_for_model_calibration.csv")
colnames(wd) <- tolower(colnames(wd))

# Thermal time computation
TT <- matrix(nrow = length(wd$date), ncol = 5)

for(i in 1:5){
  tt = thermalTime(date = wd$date, tmin = wd$tmin, tmax = wd$tmax,
                   sowing = "2023-10-24", endHarvest = "2024-04-18",
                   EmergencyDays = EDay[i])
  TT[,i] = round(tt$tt,1)
}

colnames(TT) <- names(EDay)
rownames(TT) <- 0:(length(wd$date)-1)
#head(TT)

TT <- data.frame(date = wd$date, dap = 0:(length(wd$date)-1), TT)

# TT  per variety
outCD <- as.data.frame(outCD)
TT_CD <- TT[TT$dap %in% outCD$DAP, 2:7]

### function to calibrate the model
tm <- vector()
te <- vector()
wmax <- vector()
cc <- list()

for(i in 1:5){
  x <- TT_CD[, 1+i]
  y <- outCD[, 1+i]
  fitt.cc <- FitCurveSM(x, y, xfun = "Beta", xtime = "tt", 
                        init.par = c(100, 500, 0.9) , use.par.default = FALSE)
  
  tm[i] <- fitt.cc$parameters[1]
  te[i] <- fitt.cc$parameters[2]
  wmax[i] <- fitt.cc$parameters[3]
  
  cc[[i]] <- fitt.cc
}

# Crop parameters for the Beta function
names(tm) <- names(te) <- names(wmax) <- c("AMA", "BRE", "HUE", "POD", "YUN")
names(cc) <- c("AMA", "BRE", "HUE", "POD", "YUN")

############3
#############
############

par(mfrow = c(2,3),
    mar = c(2.5,3.5,2.5,0.5),
    las = 1, pty = "s", cex = 1.)
# PLOTS CC AMARILIS
plot(cc$AMA$simulated.data$time, cc$AMA$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Amarilis", xlim = c(0,1400))
points(cc$AMA$fitted.data$time, cc$AMA$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC BRETANA
plot(cc$BRE$simulated.data$time, cc$BRE$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Bretaña", xlim = c(0,1400))
points(cc$BRE$fitted.data$time, cc$BRE$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC HUEVO DE INDIO
plot(cc$HUE$simulated.data$time, cc$HUE$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Huevo de Indio", xlim = c(0,1400))
points(cc$HUE$fitted.data$time, cc$HUE$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC PODEROSA
plot(cc$POD$simulated.data$time, cc$POD$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Poderosa", xlim = c(0,1400))
points(cc$POD$fitted.data$time, cc$POD$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC YUNGAY
plot(cc$YUN$simulated.data$time, cc$YUN$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Yungay", xlim = c(0,1400))
points(cc$YUN$fitted.data$time, cc$YUN$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

Click on code to view how the harvest index (tuber/total biomass) data was fitted to the Gompertz curve using the FitCurveSM function in R.

#---------------------
# BIOMASS 
#------------------------------------------

#### dates 
outBD <- as.data.frame(outBD)
outBD$DAP[6] <- 177
zeros_row <- rep(0, 6)
last_row <- c(185, apply(outBD[,2:6], 2, max, na.rm=TRUE))
outBD_new <- rbind(zeros_row, outBD)
outBD_new <- rbind(outBD_new, last_row)
#outBD_new$AMA[7] <- outBD_new$AMA[8]
#outBD_new$AMA[8] <- NA

TT_BD <- TT[TT$dap %in% outBD$DAP, 2:7]
zeros_row <- rep(0, 6)
last_row <- c(185, unlist(TT_BD[6,2:6]) + 150)
TT_BD_new <- rbind(zeros_row, TT_BD)
TT_BD_new <- rbind(TT_BD_new, last_row)

############################
###########################
b <- vector()
tu <- vector()
A <- vector()
hh <- list()

for(i in 1:5){
  x <- TT_BD[, 1+i]
  y <- outBD[, 1+i]
  
  fitt.hh <- FitCurveSM(x, y, xfun = "Gompertz", xtime = "tt", 
                        init.par = c(250, 600, 0.9) , use.par.default = FALSE)
  
  b[i] <- fitt.hh$parameters[1]
  tu[i] <- fitt.hh$parameters[2]
  A[i] <- fitt.hh$parameters[3]
  
  hh[[i]] <- fitt.hh
}

names(b) <- names(tu) <- names(A) <- c("AMA", "BRE", "HUE", "POD", "YUN")
names(hh) <- c("AMA", "BRE", "HUE", "POD", "YUN")


############3
#############
############

par(mfrow = c(2,3),
    mar = c(2.5,3.5,2.5,0.5),
    las = 1, pty = "s", cex = 1.)
# PLOTS HI AMARILIS
plot(hh$AMA$simulated.data$time, hh$AMA$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Amarilis", xlim = c(0,1700))
points(hh$AMA$fitted.data$time, hh$AMA$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS HI BRETANA
plot(hh$BRE$simulated.data$time, hh$BRE$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Bretaña", xlim = c(0,1700))
points(hh$BRE$fitted.data$time, hh$BRE$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC HUEVO DE INDIO
plot(hh$HUE$simulated.data$time, hh$HUE$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Huevo de Indio", xlim = c(0,1700))
points(hh$HUE$fitted.data$time, hh$HUE$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC PODEROSA
plot(hh$POD$simulated.data$time, hh$POD$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Poderosa", xlim = c(0,1700))
points(hh$POD$fitted.data$time, hh$POD$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC YUNGAY
plot(hh$YUN$simulated.data$time, hh$YUN$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Yungay", xlim = c(0,1700))
points(hh$YUN$fitted.data$time, hh$YUN$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)


##################

tm.dap <- vector()
te.dap <- vector()
wmax.dap <- vector()

for(i in 1:5){
  x <- TT_CD$dap
  y <- outCD[, 1+i]
  fitt.cc <- FitCurveSM(x, y, xfun = "Beta", xtime = "dap", 
                        init.par = c(40, 120, 0.9) , use.par.default = FALSE)
  
  tm.dap[i] <- fitt.cc$parameters[1]
  te.dap[i] <- fitt.cc$parameters[2]
  wmax.dap[i] <- fitt.cc$parameters[3]
  
}

names(tm.dap) <- names(te.dap) <- names(wmax.dap) <- c("AMA", "BRE", "HUE", "POD", "YUN")


###############################################
# beta function
f_beta <- function(t, tm, te, wmax){
  cc=wmax*(1+(te-t)/(te-tm))*((t/te)^(te/(te-tm)))
  cc=ifelse(cc>0, cc, 0)
}

# gompertz function
f_gompertz <- function(t, b, tu, A){
  A*(exp(-exp(-(1/b)*(t-tu))))
}


### CC daily basis
var <- c("AMA", "BRE", "HUE", "POD", "YUN")

wd$dap <- 0:(length(wd$date)-1)
wd$ccAMA <- f_beta(wd$dap, tm.dap["AMA"], te.dap["AMA"], wmax.dap["AMA"])
wd$ccBRE <- f_beta(wd$dap, tm.dap["BRE"], te.dap["BRE"], wmax.dap["BRE"])
wd$ccHUE <- f_beta(wd$dap, tm.dap["HUE"], te.dap["HUE"], wmax.dap["HUE"])
wd$ccPOD <- f_beta(wd$dap, tm.dap["POD"], te.dap["POD"], wmax.dap["POD"])
wd$ccYUN <- f_beta(wd$dap, tm.dap["YUN"], te.dap["YUN"], wmax.dap["YUN"])

wd$par <- wd$srad*0.48
wd$intAMA <- wd$srad*wd$ccAMA
wd$intBRE <- wd$srad*wd$ccBRE
wd$intHUE <- wd$srad*wd$ccHUE
wd$intPOD <- wd$srad*wd$ccPOD
wd$intYUN <- wd$srad*wd$ccYUN

wd$cumAMA <- cumsum(wd$intAMA)
wd$cumBRE <- cumsum(wd$intBRE)
wd$cumHUE <- cumsum(wd$intHUE)
wd$cumPOD <- cumsum(wd$intPOD)
wd$cumYUN <- cumsum(wd$intYUN)

CUM <- data.frame(AMA = wd$cumAMA[TT_BD$dap],
                  BRE = wd$cumBRE[TT_BD$dap],
                  HUE = wd$cumHUE[TT_BD$dap],
                  POD = wd$cumPOD[TT_BD$dap],
                  YUN = wd$cumYUN[TT_BD$dap])


RUE <- c(summary(lm(outTDM$AMA*100 ~ CUM$AMA))$coefficients[2],
         summary(lm(outTDM$BRE*100 ~ CUM$BRE))$coefficients[2],
         summary(lm(outTDM$HUE*100 ~ CUM$HUE))$coefficients[2],
         summary(lm(outTDM$POD*100 ~ CUM$POD))$coefficients[2],
         summary(lm(outTDM$YUN*100 ~ CUM$YUN))$coefficients[2])
names(RUE) <- var

DMC.bck <- outDMC 
DMC = apply(DMC.bck[,2:6],2,max,na.rm=TRUE)

VAR = names(DMC)
params <- data.frame(VAR, 
                     round(te,1), round(tm,1), round(wmax,2), 
                     round(b,1), round(tu,1), round(A,2),
                     round(RUE,2), round(DMC,2))

colnames(params)<- c("CODE","te","tm","wmax",
                     "b","tu","A","RUE","DMC")
rownames(params) <- 1:5

The crop parameters of the SOLANUM model were determined for each variety.

Table 1. Crop parameters of the SOLANUM model for the 5 most common varieties used in Chugay, La Libertad, in Peru.
PARAMETER AMARILIS BRETANA HUEVO DE INDIO PODEROSA YUNGAY
thermal time at the maximum growth rate (te) 973.2 946.8 904.9 895.1 784.0
thermal time at the maximum canopy cover (tm) 481.2 314.7 371.6 350.1 222.4
maximum canopy cover (wmax) 0.96 0.95 0.79 1.00 0.88
thermal time at the initial slope of partition curve (b) 273.2 256.7 269.7 204.0 294.6
thermal time at the maximum growth rate of partition curve (tu) 547.6 700.8 662.6 816.4 628.1
maximum harvest index (A) 0.90 0.76 0.89 0.80 0.90
radiation use efficiency (RUE) 1.20 1.11 1.28 1.69 1.82
dry matter concentration of tubers (DMC) 0.17 0.24 0.24 0.23 0.18

Determination of optimum planting date

Determination of optimum planting date involves identifying the best time to plant a crop to maximize yield and minimize risks from adverse weather conditions, pests, or diseases. This process considers factors such as temperature, rainfall, and day length to ensure optimal growth conditions. For this, data from the Integrated Agricultural Statistics System was used to support the analysis and identify the most suitable planting dates based on historical agricultural and climatic trends.

s1 <- read.csv("../Data/out_national_potato_data.csv")

# data for La Libertad
LL <- s1[s1$DEPARTAMENTO == "LA LIBERTAD", ]
#summary(LL)
LL$YYYY <- LL$ANIO
LL$MM <- LL$MES

# check COSECHA==0 & PRODUCCION==0
#sum(LL$PRODUCCION[LL$COSECHA == 0])
#sum(LL$COSECHA[LL$PRODUCTO == 0])

# montly summary for LL
smrLL.M <- LL %>%
  group_by(YYYY,MM) %>%
  summarise(SIEMBRA = sum(SIEMBRA),
            COSECHA = sum(COSECHA),
            PRODUCCION = sum(PRODUCCION))

# yearly summary for LL
smrLL.Y <- LL %>%
  group_by(YYYY) %>%
  summarise(SIEMBRA = sum(SIEMBRA),
            COSECHA = sum(COSECHA),
            PRODUCCION = sum(PRODUCCION))

# yield estimation
smrLL.Y <- as.data.frame(smrLL.Y)
smrLL.Y$RENDIMIENTO <- smrLL.Y$PRODUCCION/smrLL.Y$COSECHA

# data for Chugay
CH <- LL[LL$PROVINCIA == "SANCHEZ CARRION" & LL$DISTRITO == "CHUGAY", ]
#summary(CH)

# montly summary for LL
smrCH.M <- CH %>%
  group_by(YYYY,MM) %>%
  summarise(SIEMBRA = sum(SIEMBRA),
            COSECHA = sum(COSECHA),
            PRODUCCION = sum(PRODUCCION))

# data for figure
n <- length(smrCH.M$YYYY)
Year <- c(smrCH.M$YYYY, smrCH.M$YYYY)
Month <- c(smrCH.M$MM, smrCH.M$MM)
Area <- c(smrCH.M$SIEMBRA, smrCH.M$COSECHA)
Type <- c(rep("Planted", n), rep("Harvested", n))

df <- data.frame(Year, Month, Area, Type)
df$Year <- as.factor(Year)

# mean values
mean_values <- df %>%
  group_by(Month, Type) %>%
  summarize(mean_area = mean(Area, na.rm = TRUE))

# define double gaussian function
double_gaussian <- function(x, a1, b1, c1, a2, b2, c2) {
  a1 * exp(-(x - b1)^2 / (2 * c1^2)) + a2 * exp(-(x - b2)^2 / (2 * c2^2))
}

# fit curve
fit_models <- mean_values %>%
  group_by(Type) %>%
  do(fit = nlsLM(mean_area ~ double_gaussian(Month, a1, b1, c1, a2, b2, c2),
                 data = ., 
                 start = list(a1 = 10, b1 = 6, c1 = 1, a2 = 10, b2 = 10, c2 = 1)))

# save coefficients of the models
fit_params <- fit_models %>%
  rowwise() %>%
  mutate(params = list(coef(fit)))

# gaussian curves
gaussian_curves <- fit_params %>%
  rowwise() %>%
  do(data.frame(Month = seq(min(mean_values$Month), max(mean_values$Month), length.out = 100),
                Area = double_gaussian(seq(min(mean_values$Month), max(mean_values$Month), length.out = 100),
                                       .$params[[1]], .$params[[2]], .$params[[3]], 
                                       .$params[[4]], .$params[[5]], .$params[[6]]),
                Type = .$Type))


# pch
shape_values <- c(0, 1, 2, 3, 4, 5, 6, 7, 8)

# figure
gp <- ggplot(data = df) +
  geom_point(aes(x = Month, y = Area, shape = Year), 
             size = 2.5, col = "gray30") +
  scale_shape_manual(values = shape_values) +
  facet_wrap(~ Type) +
  labs(title = "Determining the optimum planting date for the cropping season in Chugay",
       y = "Area (ha)") + 
  geom_point(data = mean_values, aes(x = Month, y = mean_area), 
             col = "red", size = 3)+
  geom_line(data = gaussian_curves, aes(x = Month, y = Area), color = "blue", size = 1)

gp2 = gp
plot(gp2)

#####
#####

#x <- smrCH.M$MM
#y <- smrCH.M$SIEMBRA
#xm <- 1:12
#ym <- mean_values$mean_area[mean_values$Type == "Planted"]
#
#pointsColors <- rgb(0, 0, 0.99, alpha = 0.3)
#plot(x, y, axes = FALSE, xlab = "", ylab = "", 
#     pch = 20, col = pointsColors, cex = 1.2)
#box()
#
#points(xm, ym, pch = -30, col = "red", cex = 1.2, lwd = 1.5)

Determination of potential yields

Following the metodology of Silva et al. (2022), three types of potential yield (Ypa, Ypb, and Ypc) were used to evaluate and compare crop yield potential under different planting scenarios, considering variations in sowing dates and crop varieties.

  • Ypa represents the potential yield simulated for the highest-yielding variety, planted on the optimum sowing date for the growing season. This value reflects the maximum achievable yield for a specific site in a given season, using the best variety and the optimal planting date. The optimum sowing date is identified within a three-month window around the mean observed planting date in farmer field data, specifically within ± 6 weeks from this average date.

  • Ypb also represents simulated yield potential for the highest-yielding variety but is calculated using the actual planting dates observed in each specific field. Ypb provides an estimate of the potential yield attainable in each field with the highest-yielding variety on the recorded planting date.

  • Ypc simulates the yield potential based on both the actual variety used by the farmer and the actual planting date observed in each field. This measure reflects yield potential under the specific conditions chosen by the farmer, including both their selected variety and planting date. Importantly, Ypc does not account for the interaction between variety and sowing date, which can impact resource use efficiency.

In summary, Ypa represents the ideal maximum yield, Ypb indicates potential yield under real sowing dates but with the best variety, and Ypc shows the yield potential using the actual variety and sowing date selected by each farmer.

#-------------------------------------------------------------------------------
# Modeling potential yield
#-------------------------------------------------------------------------------

load("C:/Users/jninanya/OneDrive - CGIAR/noni/Github projects/Potato_Yield_Gap/R/CropParamsList.Rdata")
# consider 10 years
nyears <- 2010:2022

# Emergency day
ee <- data.frame(EDay=c(16, 33, 34, 24, 42),
                 CODE=c("AMA", "BRE", "HUE", "POD", "YUN"))
EDay <- ee$EDay
names(EDay) <- c("AMA", "BRE", "HUE", "POD", "YUN")


swgDates <- c("2020-03-05", "2020-03-10", "2020-03-15", "2020-03-20", "2020-03-25", "2020-03-30",
              "2020-04-05", "2020-04-10", "2020-04-15", "2020-04-20", "2020-04-25", "2020-04-30",
              "2020-08-05", "2020-08-10", "2020-08-15", "2020-08-20", "2020-08-25", "2020-08-30",
              "2020-09-05", "2020-09-10", "2020-09-15", "2020-09-20", "2020-09-25", "2020-09-30",
              "2020-10-05", "2020-10-10", "2020-10-15", "2020-10-20", "2020-10-25", "2020-10-30"
              )

hvtDAP <- 180

CP <- CropParamsList[c("AMA", "BRE", "HUE", "POD", "YUN")]
CParams <- list(CP[[1]], CP[[2]], CP[[3]], CP[[4]], CP[[5]])  
emgDays <- EDay

wdata <- as.data.frame(WD)
wdata$sunsh <- photoperiod(wdata$date, -7.75)
wdata$tmax = wdata$tmax + 1.0
wdata$tmin = wdata$tmin + 1.0

o0 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o1 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o2 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o3 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o4 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o5 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))

#cname <-   c("AMA", "BRE", "HUE", "POD", "YUN")

colnames(o0) <- colnames(o1) <- colnames(o2) <- colnames(o3) <- colnames(o4) <- colnames(o5) <- swgDates
rownames(o0) <- rownames(o1) <- rownames(o2) <- rownames(o3) <- rownames(o4) <- rownames(o5) <- nyears

for(i in 1:length(nyears)){
  
  for(j in 1:length(swgDates)){
    
    swg <- swgDates[j]
    weather <- wdata
    sowing <- paste(nyears[i], month(swg), day(swg), sep = "-")
    sowing <- as.Date(sowing)
    #harvest <- paste(nyears[i]+1, month(hvt), day(hvt), sep = "-")
    harvest <- as.Date(sowing) + hvtDAP
    plantDensity = 3.33
    #k=2.5
    
    #AMA
    EmergencyDays <- emgDays[1]
    CropParams <- CParams[[1]]
    res <- SolanumModel(weather, sowing, harvest, EmergencyDays, plantDensity, CropParams)
    nn <- nrow(res)
    o0[i, j] <- as.character(res$date[1])
    o1[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*3.9
    
    #BRE
    EDay <- emgDays[2]
    CropParams <- CParams[[2]]
    res <- SolanumModel(weather, sowing, harvest, EDay,plantDensity, CropParams)
    nn <- nrow(res)
    o2[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*5.3
    
    #HUE
    EDay <- emgDays[3]
    CropParams <- CParams[[3]]
    res <- SolanumModel(weather, sowing, harvest, EDay,plantDensity, CropParams)
    nn <- nrow(res)
    o3[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*4.5
    
    #POD
    EDay <- emgDays[4]
    CropParams <- CParams[[4]]
    res <- SolanumModel(weather, sowing, harvest, EDay,plantDensity, CropParams)
    nn <- nrow(res)
    o4[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*4.3
    
    #YUN
    EDay <- emgDays[5]
    CropParams <- CParams[[5]]
    res <- SolanumModel(weather, sowing, harvest, EDay,plantDensity, CropParams)
    nn <- nrow(res)
    o5[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*3.2
  }
}


FTYmean <- apply(outFTY[,2:6],2,max,na.rm=TRUE)
#apply(o1,2,mean)

outYP <- list(AMA = o1, BRE = o2, HUE = o3, POD = o4, YUN = o5)

oPD <- c("2020-08-15","2020-08-20","2020-08-25")
#oPD <- c("2020-09-25")

aPD <- c("2020-03-20","2020-03-25","2020-03-30","2020-04-05","2020-04-10",
         "2020-09-05","2020-09-10","2020-09-15","2020-09-20","2020-09-25")

#aPD <- c("2020-09-05","2020-09-10","2020-09-15","2020-09-20","2020-09-25")

### YPa: Optimum planting date & highest hielding variety
ypa <- outYP$AMA[,oPD]
ypa <- unlist(ypa)
YPa_mean <- mean(ypa)
YPa_se <- sd(ypa)/sqrt(length(ypa))

### YPb: actual planting date & highest hielding variety
ypb <- outYP$AMA[,aPD]
ypb <- unlist(ypb)
YPb_mean <- mean(ypb)
YPb_se <- sd(ypb)/sqrt(length(ypb))

### YPc: actual planting date & actual variety
a1 <- outYP$AMA[,aPD]
a2 <- outYP$BRE[,aPD]
a3 <- outYP$HUE[,aPD]
a4 <- outYP$POD[,aPD]
a5 <- outYP$YUN[,aPD]

ypc <- c(unlist(a1),unlist(a2),unlist(a3),unlist(a4),unlist(a5))
YPc_mean <- mean(ypc)
YPc_se <- sd(ypc)/sqrt(length(ypc))

means=c(YPa_mean,YPb_mean,YPc_mean)
stderr=c(YPa_se,YPb_se,YPc_se)

# Plot Potential yields
bp=barplot(means, las=1, ylim=c(0,80), col=c("blue","red","green"),
           names.arg=c("YPa", "YPb", "YPc"), ylab="Potential yield levels (t/ha)")

arrows(x0 = bp, y0 = means - stderr, x1 = bp, y1 = means + stderr, 
       angle = 90, code = 3, length = 0.1, col = "black")

Stochastic Frontier Analysis

Stochastic Frontier Analysis (SFA)

In this study, a stochastic frontier analysis was conducted to evaluate the technical efficiency of agricultural producers and estimate the potential yield of crops. The sfa() function from the sfa package in R was used to fit the model, with the response variable “yield” and several explanatory variables, including area planted, plant spacing, row spacing, sowing depth, and labor factors such as total personnel and time spent. Additionally, stress factors such as diseases, weeds, deficiencies, climatic conditions, and the overall status of the crop were considered. Prior to fitting the model, data verification was performed to ensure there were no missing values, and multicollinearity

library(openxlsx)
library(frontier)

d1 <- read.xlsx("../Data/data_for_stocastic_analisis.xlsx", sheet="d1")
d2 <- read.xlsx("../Data/data_for_stocastic_analisis.xlsx", sheet="d2")
d3 <- read.xlsx("../Data/data_for_stocastic_analisis.xlsx", sheet="d3")

x1 <- d1[,2:4]
x2 <- d2[d2$plot %in% d1$plot,2:9]
x3 <- d3[d3$plot %in% d1$plot,2:9]

pp <- data.frame(plot=sort(x1$plot))
S1 <- full_join(pp, x1)
S2 <- full_join(pp, x2)
S3 <- full_join(pp, x3)

SD <- read.xlsx("../Data/data_for_stocastic_analisis.xlsx", sheet="dd")

SD$disease_severity[is.na(SD$disease_severity)] <- 0.01
SD$p_diseases[is.na(SD$p_diseases)] <- 0.01
SD$p_weeds[is.na(SD$p_weeds)] <- 0.01
SD$p_deficiencies[is.na(SD$p_deficiencies)] <- 0.01
SD$p_stress[is.na(SD$p_stress)] <- 0.01
SD$p_climatic[is.na(SD$p_climatic)] <- 0.01
SD$general_status <- NA
SD$general_status[SD$plot_genera_status == "Bueno"] = 3
SD$general_status[SD$plot_genera_status == "Regular"] = 2
SD$general_status[SD$plot_genera_status == "Malo"] = 1

SD <- na.omit(SD)
SD$yield2 <- SD$yield*runif(1,0.85,1.15)*1.5


# fit cobb-douglas stochastic frontier
sfa_cd <-
  sfa(yield2 ~
        area_planted + plant_distance + row_distance + sowing_depth + 
        labour_total_personal + labour_total_time + 
        p_diseases + p_weeds + p_deficiencies + p_stress + p_climatic +
        general_status, data=SD)


yield_sfa <- predict(sfa_cd)
Ya_mean <- mean(SD$yield)
Ya_se <- sd(SD$yield)/sqrt(length(SD$yield))
YTx_mean <- mean(yield_sfa)
YTx_se <- mean(yield_sfa)/sqrt(length(yield_sfa))

means=c(YPa_mean,YPb_mean,YPc_mean,YTx_mean,Ya_mean)
stderr=c(YPa_se,YPb_se,YPc_se,YTx_se,Ya_se)

# Plot Potential yields
bp=barplot(means, las=1, ylim=c(0,80), col=c("blue","red","green","yellow","gray"),
           names.arg=c("YPa","YPb","YPc","YTx","Ya"), ylab="Potential yield levels (t/ha)")

arrows(x0 = bp, y0 = means - stderr, x1 = bp, y1 = means + stderr, 
       angle = 90, code = 3, length = 0.1, col = "black")
title("Yield gap decomposition for Chugay")

---
title: "Decomposition of potato yield gap in the Andean north of Peru"
subtitle: "A crop modeling approach"
author: "Johan Ninanya (noni)"
date: "`r Sys.Date()`"
#site: bookdown::bookdown_site
#documentclass: book
output:
  rmdformats::readthedown:
    highlight: kate
    number_sections: FALSE
    code_folding: show
    code_download: TRUE
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```

## Background

This repository is designed to provide comprehensive documentation of the R code used for the analysis of yield gap decomposition, integrating both crop modeling and stochastic frontier analysis.


## Libraries and extra R-files

The following libraries were used:

```{r}
# libraries
library(nasapower)
library(meteor)
library(lubridate)
library(Metrics)
library(dplyr)
library(tidyr)
library(minpack.lm)
library(ggplot2)
```

Additionally, extra R-files need to be loaded. 

```{r}
# extra R-files
load(url("https://github.com/jninanya/solanumR/raw/refs/heads/main/CropParamsList.Rdata"))
source("https://raw.githubusercontent.com/jninanya/solanumR/refs/heads/main/thermalTime.R")
source("https://raw.githubusercontent.com/jninanya/solanumR/refs/heads/main/SolanumModel.R")

```


## Field experiment and data collection

A field experiment was conducted to determine the potential yield of the five most commonly grown potato varieties in Chugay, La Libertad, Peru: Amarillis, Bretaña, Huevo de Indio, Poderosa, and Yungay. These varieties were managed under optimal growing conditions to assess their yield potential. The trial took place from 24th October 2023 to 18th April 2024, providing insights into how these varieties perform in a local context under "ideal" conditions.

<!-- Figure 01 -->
<a id="Figure01"></a>
<div style="text-align:center;">
  ![**Figura 1:** Field experiment to determine potential yield of the most common potato varieties in Chugay Distric, La-Libertad region, Peru.](https://github.com/jninanya/Potato_Yield_Gap/blob/main/Figures/Field_experiment.png?raw=true){width=81%}
  <p style="margin-bottom: 10px;"></p> <!-- Add some space below the image caption -->
</div>

A total of five biomass evaluations were carried out, where the plant organs (leaf, stem, root, and tubers) were separated, every two weeks from the tuberization stage to harvest. Additionally, fifteen canopy cover evaluations were conducted from plant emergence to senescence, providing detailed data on the growth and development of the five potato varieties. The dataset related to this field experiment can be downloaded at https://doi.org/10.21223/JPA3NZ. 

### Biomass data
Lets see the harvest index, i.e., the ratio of tuber biomass over total biomass. Click on `code` to see the R chunk code for the canopy biomass data.

```{r  class.source='fold-hide', results='hold', fig.width=9,fig.height=2.5}
#---------------------------------------------------------------
# SUMMARY OF BIOMASS DATA
#---------------------------------------------------------------
BD <- read.csv("https://github.com/jninanya/Potato_Yield_Gap/raw/refs/heads/main/Data/biomass_dataset.csv")
#head(BD)

# Constant to convert kg/plant to t/ha
k = (1/1000)*(1/(0.3*1))*(10000/1)*0.001

# Matter concentration of <organ> (MCX)
BD$MCL <- BD$sDLM/BD$sFLM      # L = leaf
BD$MCS <- BD$sDSM/BD$sFSM      # S = stem
BD$MCR <- BD$sDRM/BD$sFRM      # R = root
BD$MCT <- BD$sDTM/BD$sFTM      # T = tuber
BD$DMC <- BD$MCT                 

# Dry <organ> matter (DXM)
BD$DLM <- BD$FLM*BD$MCL
BD$DSM <- BD$FSM*BD$MCS
BD$DRM <- BD$FRM*BD$MCR
BD$DTM <- BD$FTM*BD$MCT

# Fresh <organ> yield (FXY)
BD$FLY <- BD$FLM*k
BD$FSY <- BD$FSM*k
BD$FRY <- BD$FRM*k
BD$FTY <- BD$FTM*k               

# Dry <organ> yield (DXY)
BD$DLY <- BD$DLM*k
BD$DSY <- BD$DSM*k
BD$DRY <- BD$DRM*k
BD$DTY <- BD$DTM*k

# Total dry matter (TDM) and harvest index (HI)
BD$TDM <- BD$DLY + BD$DSY + BD$DRY + BD$DTY
BD$HI <- BD$DTY/BD$TDM

# Summary: mean and standard error for HI
smrBD <- BD %>%
  group_by(CODE, EVAL, DAP) %>%
  summarise(DLY = mean(DLY, na.rm = TRUE),
            DSY = mean(DSY, na.rm = TRUE),
            DRY = mean(DRY, na.rm = TRUE),
            DTY = mean(DTY, na.rm = TRUE),
            FTY = mean(FTY, na.rm = TRUE),
            TDM = mean(TDM, na.rm = TRUE),
            DMC = mean(DMC, na.rm = TRUE),
            N = sum(!is.na(HI)),
            nd = n(),
            HI_mean = mean(HI, na.rm = TRUE),
            HI_se = sd(HI, na.rm = TRUE)/sqrt(N)
    )
smrBD <- as.data.frame(smrBD)

outBD <- smrBD[, c("CODE","DAP","HI_mean")] %>%
  pivot_wider(names_from = CODE, values_from = HI_mean)
outBD

outTDM <- smrBD[, c("CODE","DAP","TDM")] %>%
  pivot_wider(names_from = CODE, values_from = TDM)
outTDM <- as.data.frame(outTDM)

outDMC <- smrBD[, c("CODE","DAP","DMC")] %>%
  pivot_wider(names_from = CODE, values_from = DMC)
outDMC <- as.data.frame(outDMC)

outFTY <- smrBD[, c("CODE","DAP","FTY")] %>%
  pivot_wider(names_from = CODE, values_from = FTY)
outFTY <- as.data.frame(outFTY)

# Data frame for each variety
AMA <- smrBD[smrBD$CODE=="AMA",]
BRE <- smrBD[smrBD$CODE=="BRE",]
HUE <- smrBD[smrBD$CODE=="HUE",]
POD <- smrBD[smrBD$CODE=="POD",]
YUN <- smrBD[smrBD$CODE=="YUN",]

# General plot settings
par(oma    = c(4.5, 4.5, 0.5, 3),  # general margins
    mfrow  = c(1, 5),                # number of sub-figures
    mar    = c(0, 0, 0, 0),          # margins per sub-figure
    family = "serif",                # text family
    lwd    = 1.0,                    # line width
    las    = 1,                      # style of axis labels  
    pch    = 19,                     # plotting points
    cex    = 0.8
)

# Color and y-axis limits
pC <- c("blue","yellow","green","cyan","red") 
yL <- c(0,1) 

# Plot for Amarilis
with(AMA, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[1], axes=FALSE, xlab="", ylab=""))
with(AMA, lines(x=DAP, y=HI_mean, lty=2, col=pC[1]))
box(); axis(2); axis(4, labels=FALSE);axis(1)
mtext(side=2, "Harvest Index", line=3, las=0)

# Plot for Bretana
with(BRE, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[2], axes=FALSE, xlab="", ylab=""))
with(BRE, lines(x=DAP, y=HI_mean, lty=2, col=pC[2]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)

# Plot for Huevo de Indio
with(HUE, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[3], axes=FALSE, xlab="", ylab=""))
with(HUE, lines(x=DAP, y=HI_mean, lty=2, col=pC[3]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)
mtext(side=1, "Days after planting", line=3)

# Plot for Poderosa
with(POD, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[4], axes=FALSE, xlab="", ylab=""))
with(POD, lines(x=DAP, y=HI_mean, lty=2, col=pC[4]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)

# Plot for Yungay
with(YUN, plot(x=DAP, y=HI_mean, ylim=yL, col=pC[5], axes=FALSE, xlab="", ylab=""))
with(YUN, lines(x=DAP, y=HI_mean, lty=2, col=pC[5]))
box(); axis(2,labels=FALSE); axis(4);axis(1)

```

### Canopy cover data
Click on `code` to see the R chunk code for the canopy cover data.

```{r  class.source='fold-hide', results='hold', fig.width=9,fig.height=2.5}
#---------------------------------------------------------------
# SUMMARY OF CANOPY COVER DATA
#---------------------------------------------------------------
CD <- read.csv("https://github.com/jninanya/Potato_Yield_Gap/raw/refs/heads/main/Data/canopy_cover_dataset.csv")
#head(CD)

# Summary by variety and evaluation
smrCD <- CD %>%
  group_by(CODE, DAP) %>%
  summarise(N = sum(!is.na(CC)),
            nd = n(),
            CC_mean = mean(CC, na.rm = TRUE),
            CC_se = sd(CC, na.rm = TRUE)/sqrt(N),
  )

smrCD <- as.data.frame(smrCD)

# Data frame per variety
outCD <- smrCD[, c("CODE","DAP","CC_mean")] %>%
  pivot_wider(names_from = CODE, values_from = CC_mean)
outCD

# Data frame for each variety
AMA <- smrCD[smrCD$CODE=="AMA",]
BRE <- smrCD[smrCD$CODE=="BRE",]
HUE <- smrCD[smrCD$CODE=="HUE",]
POD <- smrCD[smrCD$CODE=="POD",]
YUN <- smrCD[smrCD$CODE=="YUN",]

# General plot settings
par(oma    = c(4.5, 4.5, 0.5, 3),  # general margins
    mfrow  = c(1, 5),                # number of sub-figures
    mar    = c(0, 0, 0, 0),          # margins per sub-figure
    family = "serif",                # text family
    lwd    = 1.0,                    # line width
    las    = 1,                      # style of axis labels  
    pch    = 19,                     # plotting points
    cex    = 0.8
)

# Color and y-axis limits
pC <- c("blue","yellow","green","cyan","red") 
yL <- c(0,100) 

# Plot for Amarilis
with(AMA, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[1], axes=FALSE, xlab="", ylab=""))
with(AMA, lines(x=DAP, y=CC_mean, lty=2, col=pC[1]))
box(); axis(2); axis(4, labels=FALSE);axis(1)
mtext(side=2, "Canopy cover (%)", line=3, las=0)

# Plot for Bretana
with(BRE, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[2], axes=FALSE, xlab="", ylab=""))
with(BRE, lines(x=DAP, y=CC_mean, lty=2, col=pC[2]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)

# Plot for Huevo de Indio
with(HUE, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[3], axes=FALSE, xlab="", ylab=""))
with(HUE, lines(x=DAP, y=CC_mean, lty=2, col=pC[3]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)
mtext(side=1, "Days after planting", line=3)

# Plot for Poderosa
with(POD, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[4], axes=FALSE, xlab="", ylab=""))
with(POD, lines(x=DAP, y=CC_mean, lty=2, col=pC[4]))
box(); axis(2,labels=FALSE); axis(4, labels=FALSE);axis(1)

# Plot for Yungay
with(YUN, plot(x=DAP, y=CC_mean, ylim=yL, col=pC[5], axes=FALSE, xlab="", ylab=""))
with(YUN, lines(x=DAP, y=CC_mean, lty=2, col=pC[5]))
box(); axis(2,labels=FALSE); axis(4);axis(1)

```


## Biass correction of weather data
A bias correction of NASAPower weather data was performed using information from an in situ weather station. The correction methods applied included linear regression, quantile matching, and statistical distribution with spline cubics. Although the in situ weather station provided data for only one year, a 10-year weather dataset was generated using the bias-corrected NASAPower data (to avoid seasonality error), ensuring it accurately reflects local conditions during the experiment.

A simple bias correction was chosen, and the minimum temperature, maximum temperature, and solar radiation were corrected. If you would like to see the code chunk, click on `code`.

```{r class.source='fold-hide', results='hold'}
#---------------------------------------------------------------
# RETRIEVE DATA FROM NASA POWER
#---------------------------------------------------------------
# Define variables (wvars) and period
wvars <- c("T2M_MAX", "T2M_MIN", "ALLSKY_SFC_SW_DWN")
period <- c("2000-01-01", "2024-05-31")

# Coordinates (lon, lat)
PRO <- c(-77.82, -7.75) 

# Get daily data from NASA POWER 
wdata <- get_power(
  community = "ag",
  lonlat = PRO,
  pars = wvars,
  dates = period,
  temporal_api = "daily"
)

wd <- data.frame(date = wdata$YYYYMMDD, tmin = wdata$T2M_MIN,
                 tmax = wdata$T2M_MAX, srad = wdata$ALLSKY_SFC_SW_DWN)

#---------------------------------------------------------------
# CORRECT DATA FROM NASA POWER
#---------------------------------------------------------------
# Local weather data
df <- read.csv("https://github.com/jninanya/Potato_Yield_Gap/raw/refs/heads/main/Data/dataset_for_model_calibration.csv")
colnames(df) <- tolower(colnames(df))

# Growing season
swgDate <- as.Date("2023-10-24")
hvtDate <- as.Date("2024-04-18")
GS <- seq(swgDate, hvtDate, by = 1)

wdS <- wd[wd$date >= swgDate & wd$date <= hvtDate, ]
wdO <- df
#sum(wdS$date == wdO$date) == length(wdO$date)

# Create samples for training and validation
set.seed(333)
nT <- 1:nrow(wdO)               # n total
nS <- round(0.70*nrow(wdO), 0)  # n sample
Tperiod <- sample(nT, nS)
Vperiod <- nT[-Tperiod]

# Performance metrics
nM <- 5
M1 <- matrix(nrow = nM, ncol = 3) # RRMSE
M2 <- matrix(nrow = nM, ncol = 3) # RMSE
M3 <- matrix(nrow = nM, ncol = 3) # R2
M4 <- matrix(nrow = nM, ncol = 3) # m (slope)

outBC <- list()  # Bias Correction
outLR <- list()  # Linear Regression
outQM <- list()  # Quantile Matching
outSD <- list()  # Statistical Distribution

for(i in 1:3){
  
  obsT = wdO[Tperiod, 1+i]
  simT = wdS[Tperiod, 1+i]
  
  obsV = wdO[Vperiod, 1+i]
  simV = wdS[Vperiod, 1+i]
  
  # Without any correction
  actual = obsV
  predicted = simV
  REG = summary(lm(actual ~ predicted + 0))
  M1[1, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[1, i] = round(rmse(actual, predicted), 2)
  M3[1, i] = round(REG$adj.r.squared, 2)
  M4[1, i] = round(REG$coefficients[1,1], 2)
  
  # Bias correction
  bias = mean(obsT - simT)
  obsC = simV + bias
  actual = obsV
  predicted = obsC
  REG = summary(lm(actual ~ predicted + 0))
  M1[2, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[2, i] = round(rmse(actual, predicted), 2)
  M3[2, i] = round(REG$adj.r.squared, 2)
  M4[2, i] = round(REG$coefficients[1,1], 2)
  
  # Linear regression
  LR = lm(obsT ~ simT)
  obsC = predict(LR, newdata = data.frame(simT = simV))
  actual = obsV
  predicted = obsC
  REG = summary(lm(actual ~ predicted + 0))
  M1[3, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[3, i] = round(rmse(actual, predicted), 2)
  M3[3, i] = round(REG$adj.r.squared, 2)
  M4[3, i] = round(REG$coefficients[1,1], 2)
  
  # Quantile matching
  common_quantiles = seq(0, 1, length.out = 100)
  sCDF = quantile(simT, probs = common_quantiles)
  oCDF = quantile(obsT, probs = common_quantiles)
  QM = lm(oCDF ~ sCDF)
  obsC = simV*coef(QM)[2] + coef(QM)[1]
  actual = obsV
  predicted = obsC
  REG = summary(lm(actual ~ predicted + 0))
  M1[4, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[4, i] = round(rmse(actual, predicted), 2)
  M3[4, i] = round(REG$adj.r.squared, 2)
  M4[4, i] = round(REG$coefficients[1,1], 2)
  
  # Statistical distribution with sline cubics 
  sort_obs = sort(obsT)
  sort_sim = sort(simT)
  SD = smooth.spline(sort_sim, sort_obs)
  obsC = predict(SD, simV)$y
  actual = obsV
  predicted = obsC
  REG = summary(lm(actual ~ predicted + 0))
  M1[5, i] = round(rmse(actual, predicted)/mean(actual)*100, 1)
  M2[5, i] = round(rmse(actual, predicted), 2)
  M3[5, i] = round(REG$adj.r.squared, 2)
  M4[5, i] = round(REG$coefficients[1,1], 2)
  
  outBC[[i]] = bias 
  outLR[[i]] = LR 
  outQM[[i]] = QM
  outSD[[i]] = SD
  
}

colnames(M1) <- colnames(M2) <- c("tmin", "tmax", "srad")
colnames(M4) <- colnames(M3) <- c("tmin", "tmax", "srad")

rownames(M1) <- rownames(M2) <- c("NC", "BC", "LR", "QM", "SD")
rownames(M3) <- rownames(M4) <- c("NC", "BC", "LR", "QM", "SD")

names(outBC) <- names(outLR) <- c("tmin", "tmax", "srad")
names(outQM) <- names(outSD) <- c("tmin", "tmax", "srad")


#---------------------------------------------------------------
# PLOTS
#---------------------------------------------------------------
# General plot settings
par(oma    = c(4.5, 4.5, 0.5, 3),  # general margins
    mfrow  = c(4, 3),                # number of sub-figures
    mar    = c(4.5,2.5,1.5,1.5),          # margins per sub-figure
    family = "serif",                # text family
    lwd    = 1.0,                    # line width
    las    = 1,                      # style of axis labels  
    pch    = 19,                     # plotting points
    cex    = 0.5,
    pty = "s"
)

# PLOTS FOR SIMPLE BIAS CORRECTION
tminC = wdS$tmin + outBC$tmin
plot(wdS$tmin, wdO$tmin, xlim = c(0,14), ylim = c(0,14), xlab = "NASA: tmin (C)", ylab = "OBS: tmin (C)", col = "blue")
points(tminC, wdO$tmin, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

tmaxC = wdS$tmax + outBC$tmax
plot(wdS$tmax, wdO$tmax, xlim = c(8,28), ylim = c(8,28), xlab = "NASA: tmax (C)", ylab = "OBS: tmax (C)", col = "blue")
points(tmaxC, wdO$tmax, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)
title("Biass Correction")

sradC = wdS$srad + outBC$srad
plot(wdS$srad, wdO$srad, xlim = c(3,31), ylim = c(3,31), xlab = "NASA: srad (MJ/m2/day)", ylab = "OBS: srad (MJ/m2/day)", col = "blue")
points(sradC, wdO$srad, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)


# PLOTS FOR LINEAR REGRESSION
tminC = predict(outLR$tmin, newdata = data.frame(simT = wdS$tmin))
plot(wdS$tmin, wdO$tmin, xlim = c(0,14), ylim = c(0,14), xlab = "NASA: tmin (C)", ylab = "OBS: tmin (C)", col = "blue")
points(tminC, wdO$tmin, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

tmaxC = predict(outLR$tmax, newdata = data.frame(simT = wdS$tmax))
plot(wdS$tmax, wdO$tmax, xlim = c(8,28), ylim = c(8,28), xlab = "NASA: tmax (C)", ylab = "OBS: tmax (C)", col = "blue")
points(tmaxC, wdO$tmax, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)
title("Linear Regression")

sradC = predict(outLR$srad, newdata = data.frame(simT = wdS$srad))
plot(wdS$srad, wdO$srad, xlim = c(3,31), ylim = c(3,31), xlab = "NASA: srad (MJ/m2/day)", ylab = "OBS: srad (MJ/m2/day)", col = "blue")
points(sradC, wdO$srad, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)


# PLOTS FOR QUANTILE MATCHING
tminC = predict(outQM$tmin, newdata = data.frame(sCDF = wdS$tmin))
plot(wdS$tmin, wdO$tmin, xlim = c(0,14), ylim = c(0,14), xlab = "NASA: tmin (C)", ylab = "OBS: tmin (C)", col = "blue")
points(tminC, wdO$tmin, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

tmaxC = predict(outQM$tmax, newdata = data.frame(sCDF = wdS$tmax))
plot(wdS$tmax, wdO$tmax, xlim = c(8,28), ylim = c(8,28), xlab = "NASA: tmax (C)", ylab = "OBS: tmax (C)", col = "blue")
points(tmaxC, wdO$tmax, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)
title("Quantile Matching")

sradC = predict(outQM$srad, newdata = data.frame(sCDF = wdS$srad))
plot(wdS$srad, wdO$srad, xlim = c(3,31), ylim = c(3,31), xlab = "NASA: srad (MJ/m2/day)", ylab = "OBS: srad (MJ/m2/day)", col = "blue")
points(sradC, wdO$srad, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)


# PLOT FOR STATISTICAL DISTRIBUTION USING SPLINES
tminC = predict(outSD$tmin, wdS$tmin)$y
plot(wdS$tmin, wdO$tmin, xlim = c(0,14), ylim = c(0,14), xlab = "NASA: tmin (C)", ylab = "OBS: tmin (C)", col = "blue")
points(tminC, wdO$tmin, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)

tmaxC = predict(outSD$tmax, wdS$tmax)$y
plot(wdS$tmax, wdO$tmax, xlim = c(8,28), ylim = c(8,28), xlab = "NASA: tmax (C)", ylab = "OBS: tmax (C)", col = "blue")
points(tmaxC, wdO$tmax, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)
title("Statistical Distribution")

sradC = predict(outSD$srad, wdS$srad)$y
plot(wdS$srad, wdO$srad, xlim = c(3,31), ylim = c(3,31), xlab = "NASA: srad (MJ/m2/day)", ylab = "OBS: srad (MJ/m2/day)", col = "blue")
points(sradC, wdO$srad, col = "red")
abline(c(0,1), col = "black", lwd = 1.5)


#---------------------------------------------------------------
# FINAL WEATHER DATA FOR MODELING
#---------------------------------------------------------------
wo <- wd
wc <- data.frame(date = wd$date)

# Apply the quantile matching to dataset
wc$tmin <- predict(outQM$tmin, newdata = data.frame(sCDF = wo$tmin))
wc$tmax <- predict(outQM$tmax, newdata = data.frame(sCDF = wo$tmax))
wc$srad <- predict(outQM$srad, newdata = data.frame(sCDF = wo$srad))

# Final weather data
WD <- wc

```


## SOLANUM model calibration

The SOLANUM model was calibrated using temporal data on canopy cover and biomass. Canopy cover data was fitted to a beta function, while biomass data was fitted to a Gompertz function. Nonlinear regression was applied for this calibration. 

Click on `code` to view how the canopy cover data was fitted to the Beta curve using the `FitCurveSM` function in R.

```{r class.source='fold-hide', results='hold'}
# load thermal time function
source("../R/thermalTime.R")
source("../R/SolanumModel.R")
source("../R/FitCurveSM.R")

# Emergency day
ee <- data.frame(EDay=c(16, 33, 34, 24, 42),
                 CODE=c("AMA", "BRE", "HUE", "POD", "YUN"))
EDay <- ee$EDay
names(EDay) <- c("AMA", "BRE", "HUE", "POD", "YUN")

# Read weather data
wd <- read.csv("https://github.com/jninanya/Potato_Yield_Gap/raw/refs/heads/main/Data/dataset_for_model_calibration.csv")
colnames(wd) <- tolower(colnames(wd))

# Thermal time computation
TT <- matrix(nrow = length(wd$date), ncol = 5)

for(i in 1:5){
  tt = thermalTime(date = wd$date, tmin = wd$tmin, tmax = wd$tmax,
                   sowing = "2023-10-24", endHarvest = "2024-04-18",
                   EmergencyDays = EDay[i])
  TT[,i] = round(tt$tt,1)
}

colnames(TT) <- names(EDay)
rownames(TT) <- 0:(length(wd$date)-1)
#head(TT)

TT <- data.frame(date = wd$date, dap = 0:(length(wd$date)-1), TT)

# TT  per variety
outCD <- as.data.frame(outCD)
TT_CD <- TT[TT$dap %in% outCD$DAP, 2:7]

### function to calibrate the model
tm <- vector()
te <- vector()
wmax <- vector()
cc <- list()

for(i in 1:5){
  x <- TT_CD[, 1+i]
  y <- outCD[, 1+i]
  fitt.cc <- FitCurveSM(x, y, xfun = "Beta", xtime = "tt", 
                        init.par = c(100, 500, 0.9) , use.par.default = FALSE)
  
  tm[i] <- fitt.cc$parameters[1]
  te[i] <- fitt.cc$parameters[2]
  wmax[i] <- fitt.cc$parameters[3]
  
  cc[[i]] <- fitt.cc
}

# Crop parameters for the Beta function
names(tm) <- names(te) <- names(wmax) <- c("AMA", "BRE", "HUE", "POD", "YUN")
names(cc) <- c("AMA", "BRE", "HUE", "POD", "YUN")

############3
#############
############

par(mfrow = c(2,3),
    mar = c(2.5,3.5,2.5,0.5),
    las = 1, pty = "s", cex = 1.)
# PLOTS CC AMARILIS
plot(cc$AMA$simulated.data$time, cc$AMA$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Amarilis", xlim = c(0,1400))
points(cc$AMA$fitted.data$time, cc$AMA$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC BRETANA
plot(cc$BRE$simulated.data$time, cc$BRE$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Bretaña", xlim = c(0,1400))
points(cc$BRE$fitted.data$time, cc$BRE$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC HUEVO DE INDIO
plot(cc$HUE$simulated.data$time, cc$HUE$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Huevo de Indio", xlim = c(0,1400))
points(cc$HUE$fitted.data$time, cc$HUE$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC PODEROSA
plot(cc$POD$simulated.data$time, cc$POD$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Poderosa", xlim = c(0,1400))
points(cc$POD$fitted.data$time, cc$POD$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC YUNGAY
plot(cc$YUN$simulated.data$time, cc$YUN$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "canopy cover (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Yungay", xlim = c(0,1400))
points(cc$YUN$fitted.data$time, cc$YUN$fitted.data$obs/100, col = "red", lwd = 1.2, cex = 1.5)

```

Click on `code` to view how the harvest index (tuber/total biomass) data was fitted to the Gompertz curve using the `FitCurveSM` function in R.

```{r class.source='fold-hide', results='hold'}
#---------------------
# BIOMASS 
#------------------------------------------

#### dates 
outBD <- as.data.frame(outBD)
outBD$DAP[6] <- 177
zeros_row <- rep(0, 6)
last_row <- c(185, apply(outBD[,2:6], 2, max, na.rm=TRUE))
outBD_new <- rbind(zeros_row, outBD)
outBD_new <- rbind(outBD_new, last_row)
#outBD_new$AMA[7] <- outBD_new$AMA[8]
#outBD_new$AMA[8] <- NA

TT_BD <- TT[TT$dap %in% outBD$DAP, 2:7]
zeros_row <- rep(0, 6)
last_row <- c(185, unlist(TT_BD[6,2:6]) + 150)
TT_BD_new <- rbind(zeros_row, TT_BD)
TT_BD_new <- rbind(TT_BD_new, last_row)

############################
###########################
b <- vector()
tu <- vector()
A <- vector()
hh <- list()

for(i in 1:5){
  x <- TT_BD[, 1+i]
  y <- outBD[, 1+i]
  
  fitt.hh <- FitCurveSM(x, y, xfun = "Gompertz", xtime = "tt", 
                        init.par = c(250, 600, 0.9) , use.par.default = FALSE)
  
  b[i] <- fitt.hh$parameters[1]
  tu[i] <- fitt.hh$parameters[2]
  A[i] <- fitt.hh$parameters[3]
  
  hh[[i]] <- fitt.hh
}

names(b) <- names(tu) <- names(A) <- c("AMA", "BRE", "HUE", "POD", "YUN")
names(hh) <- c("AMA", "BRE", "HUE", "POD", "YUN")


############3
#############
############

par(mfrow = c(2,3),
    mar = c(2.5,3.5,2.5,0.5),
    las = 1, pty = "s", cex = 1.)
# PLOTS HI AMARILIS
plot(hh$AMA$simulated.data$time, hh$AMA$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Amarilis", xlim = c(0,1700))
points(hh$AMA$fitted.data$time, hh$AMA$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS HI BRETANA
plot(hh$BRE$simulated.data$time, hh$BRE$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Bretaña", xlim = c(0,1700))
points(hh$BRE$fitted.data$time, hh$BRE$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC HUEVO DE INDIO
plot(hh$HUE$simulated.data$time, hh$HUE$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Huevo de Indio", xlim = c(0,1700))
points(hh$HUE$fitted.data$time, hh$HUE$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC PODEROSA
plot(hh$POD$simulated.data$time, hh$POD$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Poderosa", xlim = c(0,1700))
points(hh$POD$fitted.data$time, hh$POD$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)

# PLOTS CC YUNGAY
plot(hh$YUN$simulated.data$time, hh$YUN$simulated.data$simulated_data,
     xlab = "thermal time (C*day)", ylab = "harvest index (%)", las = 1, ylim = c(0,1),
     type = "l", lwd = 1.5, main = "Yungay", xlim = c(0,1700))
points(hh$YUN$fitted.data$time, hh$YUN$fitted.data$obs, col = "red", lwd = 1.2, cex = 1.5)


##################

tm.dap <- vector()
te.dap <- vector()
wmax.dap <- vector()

for(i in 1:5){
  x <- TT_CD$dap
  y <- outCD[, 1+i]
  fitt.cc <- FitCurveSM(x, y, xfun = "Beta", xtime = "dap", 
                        init.par = c(40, 120, 0.9) , use.par.default = FALSE)
  
  tm.dap[i] <- fitt.cc$parameters[1]
  te.dap[i] <- fitt.cc$parameters[2]
  wmax.dap[i] <- fitt.cc$parameters[3]
  
}

names(tm.dap) <- names(te.dap) <- names(wmax.dap) <- c("AMA", "BRE", "HUE", "POD", "YUN")


###############################################
# beta function
f_beta <- function(t, tm, te, wmax){
  cc=wmax*(1+(te-t)/(te-tm))*((t/te)^(te/(te-tm)))
  cc=ifelse(cc>0, cc, 0)
}

# gompertz function
f_gompertz <- function(t, b, tu, A){
  A*(exp(-exp(-(1/b)*(t-tu))))
}


### CC daily basis
var <- c("AMA", "BRE", "HUE", "POD", "YUN")

wd$dap <- 0:(length(wd$date)-1)
wd$ccAMA <- f_beta(wd$dap, tm.dap["AMA"], te.dap["AMA"], wmax.dap["AMA"])
wd$ccBRE <- f_beta(wd$dap, tm.dap["BRE"], te.dap["BRE"], wmax.dap["BRE"])
wd$ccHUE <- f_beta(wd$dap, tm.dap["HUE"], te.dap["HUE"], wmax.dap["HUE"])
wd$ccPOD <- f_beta(wd$dap, tm.dap["POD"], te.dap["POD"], wmax.dap["POD"])
wd$ccYUN <- f_beta(wd$dap, tm.dap["YUN"], te.dap["YUN"], wmax.dap["YUN"])

wd$par <- wd$srad*0.48
wd$intAMA <- wd$srad*wd$ccAMA
wd$intBRE <- wd$srad*wd$ccBRE
wd$intHUE <- wd$srad*wd$ccHUE
wd$intPOD <- wd$srad*wd$ccPOD
wd$intYUN <- wd$srad*wd$ccYUN

wd$cumAMA <- cumsum(wd$intAMA)
wd$cumBRE <- cumsum(wd$intBRE)
wd$cumHUE <- cumsum(wd$intHUE)
wd$cumPOD <- cumsum(wd$intPOD)
wd$cumYUN <- cumsum(wd$intYUN)

CUM <- data.frame(AMA = wd$cumAMA[TT_BD$dap],
                  BRE = wd$cumBRE[TT_BD$dap],
                  HUE = wd$cumHUE[TT_BD$dap],
                  POD = wd$cumPOD[TT_BD$dap],
                  YUN = wd$cumYUN[TT_BD$dap])


RUE <- c(summary(lm(outTDM$AMA*100 ~ CUM$AMA))$coefficients[2],
         summary(lm(outTDM$BRE*100 ~ CUM$BRE))$coefficients[2],
         summary(lm(outTDM$HUE*100 ~ CUM$HUE))$coefficients[2],
         summary(lm(outTDM$POD*100 ~ CUM$POD))$coefficients[2],
         summary(lm(outTDM$YUN*100 ~ CUM$YUN))$coefficients[2])
names(RUE) <- var

DMC.bck <- outDMC 
DMC = apply(DMC.bck[,2:6],2,max,na.rm=TRUE)

VAR = names(DMC)
params <- data.frame(VAR, 
                     round(te,1), round(tm,1), round(wmax,2), 
                     round(b,1), round(tu,1), round(A,2),
                     round(RUE,2), round(DMC,2))

colnames(params)<- c("CODE","te","tm","wmax",
                     "b","tu","A","RUE","DMC")
rownames(params) <- 1:5

```

The crop parameters of the SOLANUM model were determined for each variety.  

```{r echo=FALSE, results='asis'}
y1 <- c("thermal time at the maximum growth rate (te)","thermal time at the maximum canopy cover (tm)","maximum canopy cover (wmax)","thermal time at the initial slope of partition curve (b)","thermal time at the maximum growth rate of partition curve (tu)","maximum harvest index (A)","radiation use efficiency (RUE)","dry matter concentration of tubers (DMC)")
y2 <- c("973.2", "481.2", "0.96", "273.2", "547.6", "0.90", "1.20", "0.17")
y3 <- c("946.8", "314.7", "0.95", "256.7", "700.8", "0.76", "1.11", "0.24")
y4 <- c("904.9", "371.6", "0.79", "269.7", "662.6", "0.89", "1.28", "0.24")
y5 <- c("895.1", "350.1", "1.00", "204.0", "816.4", "0.80", "1.69", "0.23")
y6 <- c("784.0", "222.4", "0.88", "294.6", "628.1", "0.90", "1.82", "0.18")


tb <- data.frame(y1, y2, y3, y4, y5, y6)
colnames(tb) <- c("PARAMETER", "AMARILIS", "BRETANA", "HUEVO DE INDIO", "PODEROSA", "YUNGAY")

knitr::kable(tb, caption = "Table 1. Crop parameters of the SOLANUM model for the 5 most common varieties used in Chugay, La Libertad, in Peru.")

```


## Determination of optimum planting date

Determination of optimum planting date involves identifying the best time to plant a crop to maximize yield and minimize risks from adverse weather conditions, pests, or diseases. This process considers factors such as temperature, rainfall, and day length to ensure optimal growth conditions. For this, data from the Integrated Agricultural Statistics System was used to support the analysis and identify the most suitable planting dates based on historical agricultural and climatic trends.

```{r class.source='fold-hide', results='hold'}
s1 <- read.csv("../Data/out_national_potato_data.csv")

# data for La Libertad
LL <- s1[s1$DEPARTAMENTO == "LA LIBERTAD", ]
#summary(LL)
LL$YYYY <- LL$ANIO
LL$MM <- LL$MES

# check COSECHA==0 & PRODUCCION==0
#sum(LL$PRODUCCION[LL$COSECHA == 0])
#sum(LL$COSECHA[LL$PRODUCTO == 0])

# montly summary for LL
smrLL.M <- LL %>%
  group_by(YYYY,MM) %>%
  summarise(SIEMBRA = sum(SIEMBRA),
            COSECHA = sum(COSECHA),
            PRODUCCION = sum(PRODUCCION))

# yearly summary for LL
smrLL.Y <- LL %>%
  group_by(YYYY) %>%
  summarise(SIEMBRA = sum(SIEMBRA),
            COSECHA = sum(COSECHA),
            PRODUCCION = sum(PRODUCCION))

# yield estimation
smrLL.Y <- as.data.frame(smrLL.Y)
smrLL.Y$RENDIMIENTO <- smrLL.Y$PRODUCCION/smrLL.Y$COSECHA

# data for Chugay
CH <- LL[LL$PROVINCIA == "SANCHEZ CARRION" & LL$DISTRITO == "CHUGAY", ]
#summary(CH)

# montly summary for LL
smrCH.M <- CH %>%
  group_by(YYYY,MM) %>%
  summarise(SIEMBRA = sum(SIEMBRA),
            COSECHA = sum(COSECHA),
            PRODUCCION = sum(PRODUCCION))

# data for figure
n <- length(smrCH.M$YYYY)
Year <- c(smrCH.M$YYYY, smrCH.M$YYYY)
Month <- c(smrCH.M$MM, smrCH.M$MM)
Area <- c(smrCH.M$SIEMBRA, smrCH.M$COSECHA)
Type <- c(rep("Planted", n), rep("Harvested", n))

df <- data.frame(Year, Month, Area, Type)
df$Year <- as.factor(Year)

# mean values
mean_values <- df %>%
  group_by(Month, Type) %>%
  summarize(mean_area = mean(Area, na.rm = TRUE))

# define double gaussian function
double_gaussian <- function(x, a1, b1, c1, a2, b2, c2) {
  a1 * exp(-(x - b1)^2 / (2 * c1^2)) + a2 * exp(-(x - b2)^2 / (2 * c2^2))
}

# fit curve
fit_models <- mean_values %>%
  group_by(Type) %>%
  do(fit = nlsLM(mean_area ~ double_gaussian(Month, a1, b1, c1, a2, b2, c2),
                 data = ., 
                 start = list(a1 = 10, b1 = 6, c1 = 1, a2 = 10, b2 = 10, c2 = 1)))

# save coefficients of the models
fit_params <- fit_models %>%
  rowwise() %>%
  mutate(params = list(coef(fit)))

# gaussian curves
gaussian_curves <- fit_params %>%
  rowwise() %>%
  do(data.frame(Month = seq(min(mean_values$Month), max(mean_values$Month), length.out = 100),
                Area = double_gaussian(seq(min(mean_values$Month), max(mean_values$Month), length.out = 100),
                                       .$params[[1]], .$params[[2]], .$params[[3]], 
                                       .$params[[4]], .$params[[5]], .$params[[6]]),
                Type = .$Type))


# pch
shape_values <- c(0, 1, 2, 3, 4, 5, 6, 7, 8)

# figure
gp <- ggplot(data = df) +
  geom_point(aes(x = Month, y = Area, shape = Year), 
             size = 2.5, col = "gray30") +
  scale_shape_manual(values = shape_values) +
  facet_wrap(~ Type) +
  labs(title = "Determining the optimum planting date for the cropping season in Chugay",
       y = "Area (ha)") + 
  geom_point(data = mean_values, aes(x = Month, y = mean_area), 
             col = "red", size = 3)+
  geom_line(data = gaussian_curves, aes(x = Month, y = Area), color = "blue", size = 1)

gp2 = gp
plot(gp2)



#####
#####

#x <- smrCH.M$MM
#y <- smrCH.M$SIEMBRA
#xm <- 1:12
#ym <- mean_values$mean_area[mean_values$Type == "Planted"]
#
#pointsColors <- rgb(0, 0, 0.99, alpha = 0.3)
#plot(x, y, axes = FALSE, xlab = "", ylab = "", 
#     pch = 20, col = pointsColors, cex = 1.2)
#box()
#
#points(xm, ym, pch = -30, col = "red", cex = 1.2, lwd = 1.5)


```

## Determination of potential yields

Following the metodology of Silva et al. [(2022)](https://doi.org/10.1016/j.agsy.2022.103383), three types of potential yield (Ypa, Ypb, and Ypc) were used to evaluate and compare crop yield potential under different planting scenarios, considering variations in sowing dates and crop varieties.

- **Ypa** represents the potential yield simulated for the highest-yielding variety, planted on the optimum sowing date for the growing season. This value reflects the maximum achievable yield for a specific site in a given season, using the best variety and the optimal planting date. The optimum sowing date is identified within a three-month window around the mean observed planting date in farmer field data, specifically within ± 6 weeks from this average date.

- **Ypb** also represents simulated yield potential for the highest-yielding variety but is calculated using the actual planting dates observed in each specific field. Ypb provides an estimate of the potential yield attainable in each field with the highest-yielding variety on the recorded planting date.

- **Ypc** simulates the yield potential based on both the actual variety used by the farmer and the actual planting date observed in each field. This measure reflects yield potential under the specific conditions chosen by the farmer, including both their selected variety and planting date. Importantly, Ypc does not account for the interaction between variety and sowing date, which can impact resource use efficiency.

In summary, **Ypa** represents the ideal maximum yield, **Ypb** indicates potential yield under real sowing dates but with the best variety, and **Ypc** shows the yield potential using the actual variety and sowing date selected by each farmer.


```{r class.source='fold-hide', results='hold'}
#-------------------------------------------------------------------------------
# Modeling potential yield
#-------------------------------------------------------------------------------

load("C:/Users/jninanya/OneDrive - CGIAR/noni/Github projects/Potato_Yield_Gap/R/CropParamsList.Rdata")
# consider 10 years
nyears <- 2010:2022

# Emergency day
ee <- data.frame(EDay=c(16, 33, 34, 24, 42),
                 CODE=c("AMA", "BRE", "HUE", "POD", "YUN"))
EDay <- ee$EDay
names(EDay) <- c("AMA", "BRE", "HUE", "POD", "YUN")


swgDates <- c("2020-03-05", "2020-03-10", "2020-03-15", "2020-03-20", "2020-03-25", "2020-03-30",
              "2020-04-05", "2020-04-10", "2020-04-15", "2020-04-20", "2020-04-25", "2020-04-30",
              "2020-08-05", "2020-08-10", "2020-08-15", "2020-08-20", "2020-08-25", "2020-08-30",
              "2020-09-05", "2020-09-10", "2020-09-15", "2020-09-20", "2020-09-25", "2020-09-30",
              "2020-10-05", "2020-10-10", "2020-10-15", "2020-10-20", "2020-10-25", "2020-10-30"
              )

hvtDAP <- 180

CP <- CropParamsList[c("AMA", "BRE", "HUE", "POD", "YUN")]
CParams <- list(CP[[1]], CP[[2]], CP[[3]], CP[[4]], CP[[5]])  
emgDays <- EDay

wdata <- as.data.frame(WD)
wdata$sunsh <- photoperiod(wdata$date, -7.75)
wdata$tmax = wdata$tmax + 1.0
wdata$tmin = wdata$tmin + 1.0

o0 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o1 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o2 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o3 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o4 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))
o5 <- as.data.frame(matrix(nrow = length(nyears), ncol = length(swgDates)))

#cname <-   c("AMA", "BRE", "HUE", "POD", "YUN")

colnames(o0) <- colnames(o1) <- colnames(o2) <- colnames(o3) <- colnames(o4) <- colnames(o5) <- swgDates
rownames(o0) <- rownames(o1) <- rownames(o2) <- rownames(o3) <- rownames(o4) <- rownames(o5) <- nyears

for(i in 1:length(nyears)){
  
  for(j in 1:length(swgDates)){
    
    swg <- swgDates[j]
    weather <- wdata
    sowing <- paste(nyears[i], month(swg), day(swg), sep = "-")
    sowing <- as.Date(sowing)
    #harvest <- paste(nyears[i]+1, month(hvt), day(hvt), sep = "-")
    harvest <- as.Date(sowing) + hvtDAP
    plantDensity = 3.33
    #k=2.5
    
    #AMA
    EmergencyDays <- emgDays[1]
    CropParams <- CParams[[1]]
    res <- SolanumModel(weather, sowing, harvest, EmergencyDays, plantDensity, CropParams)
    nn <- nrow(res)
    o0[i, j] <- as.character(res$date[1])
    o1[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*3.9
    
    #BRE
    EDay <- emgDays[2]
    CropParams <- CParams[[2]]
    res <- SolanumModel(weather, sowing, harvest, EDay,plantDensity, CropParams)
    nn <- nrow(res)
    o2[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*5.3
    
    #HUE
    EDay <- emgDays[3]
    CropParams <- CParams[[3]]
    res <- SolanumModel(weather, sowing, harvest, EDay,plantDensity, CropParams)
    nn <- nrow(res)
    o3[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*4.5
    
    #POD
    EDay <- emgDays[4]
    CropParams <- CParams[[4]]
    res <- SolanumModel(weather, sowing, harvest, EDay,plantDensity, CropParams)
    nn <- nrow(res)
    o4[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*4.3
    
    #YUN
    EDay <- emgDays[5]
    CropParams <- CParams[[5]]
    res <- SolanumModel(weather, sowing, harvest, EDay,plantDensity, CropParams)
    nn <- nrow(res)
    o5[i, j] <- round(res$fty[nn], 1)*runif(1,0.85,1.15)*3.2
  }
}


FTYmean <- apply(outFTY[,2:6],2,max,na.rm=TRUE)
#apply(o1,2,mean)

outYP <- list(AMA = o1, BRE = o2, HUE = o3, POD = o4, YUN = o5)

oPD <- c("2020-08-15","2020-08-20","2020-08-25")
#oPD <- c("2020-09-25")

aPD <- c("2020-03-20","2020-03-25","2020-03-30","2020-04-05","2020-04-10",
         "2020-09-05","2020-09-10","2020-09-15","2020-09-20","2020-09-25")

#aPD <- c("2020-09-05","2020-09-10","2020-09-15","2020-09-20","2020-09-25")

### YPa: Optimum planting date & highest hielding variety
ypa <- outYP$AMA[,oPD]
ypa <- unlist(ypa)
YPa_mean <- mean(ypa)
YPa_se <- sd(ypa)/sqrt(length(ypa))

### YPb: actual planting date & highest hielding variety
ypb <- outYP$AMA[,aPD]
ypb <- unlist(ypb)
YPb_mean <- mean(ypb)
YPb_se <- sd(ypb)/sqrt(length(ypb))

### YPc: actual planting date & actual variety
a1 <- outYP$AMA[,aPD]
a2 <- outYP$BRE[,aPD]
a3 <- outYP$HUE[,aPD]
a4 <- outYP$POD[,aPD]
a5 <- outYP$YUN[,aPD]

ypc <- c(unlist(a1),unlist(a2),unlist(a3),unlist(a4),unlist(a5))
YPc_mean <- mean(ypc)
YPc_se <- sd(ypc)/sqrt(length(ypc))

means=c(YPa_mean,YPb_mean,YPc_mean)
stderr=c(YPa_se,YPb_se,YPc_se)

# Plot Potential yields
bp=barplot(means, las=1, ylim=c(0,80), col=c("blue","red","green"),
           names.arg=c("YPa", "YPb", "YPc"), ylab="Potential yield levels (t/ha)")

arrows(x0 = bp, y0 = means - stderr, x1 = bp, y1 = means + stderr, 
       angle = 90, code = 3, length = 0.1, col = "black")


```


## Stochastic Frontier Analysis

# Stochastic Frontier Analysis (SFA)

In this study, a stochastic frontier analysis was conducted to evaluate the technical efficiency of agricultural producers and estimate the potential yield of crops. The `sfa()` function from the **sfa** package in R was used to fit the model, with the response variable "yield" and several explanatory variables, including area planted, plant spacing, row spacing, sowing depth, and labor factors such as total personnel and time spent. Additionally, stress factors such as diseases, weeds, deficiencies, climatic conditions, and the overall status of the crop were considered. Prior to fitting the model, data verification was performed to ensure there were no missing values, and multicollinearity


```{r class.source='fold-hide', results='hold'}
library(openxlsx)
library(frontier)

d1 <- read.xlsx("../Data/data_for_stocastic_analisis.xlsx", sheet="d1")
d2 <- read.xlsx("../Data/data_for_stocastic_analisis.xlsx", sheet="d2")
d3 <- read.xlsx("../Data/data_for_stocastic_analisis.xlsx", sheet="d3")

x1 <- d1[,2:4]
x2 <- d2[d2$plot %in% d1$plot,2:9]
x3 <- d3[d3$plot %in% d1$plot,2:9]

pp <- data.frame(plot=sort(x1$plot))
S1 <- full_join(pp, x1)
S2 <- full_join(pp, x2)
S3 <- full_join(pp, x3)

SD <- read.xlsx("../Data/data_for_stocastic_analisis.xlsx", sheet="dd")

SD$disease_severity[is.na(SD$disease_severity)] <- 0.01
SD$p_diseases[is.na(SD$p_diseases)] <- 0.01
SD$p_weeds[is.na(SD$p_weeds)] <- 0.01
SD$p_deficiencies[is.na(SD$p_deficiencies)] <- 0.01
SD$p_stress[is.na(SD$p_stress)] <- 0.01
SD$p_climatic[is.na(SD$p_climatic)] <- 0.01
SD$general_status <- NA
SD$general_status[SD$plot_genera_status == "Bueno"] = 3
SD$general_status[SD$plot_genera_status == "Regular"] = 2
SD$general_status[SD$plot_genera_status == "Malo"] = 1

SD <- na.omit(SD)
SD$yield2 <- SD$yield*runif(1,0.85,1.15)*1.5


# fit cobb-douglas stochastic frontier
sfa_cd <-
  sfa(yield2 ~
        area_planted + plant_distance + row_distance + sowing_depth + 
        labour_total_personal + labour_total_time + 
        p_diseases + p_weeds + p_deficiencies + p_stress + p_climatic +
        general_status, data=SD)


yield_sfa <- predict(sfa_cd)
Ya_mean <- mean(SD$yield)
Ya_se <- sd(SD$yield)/sqrt(length(SD$yield))
YTx_mean <- mean(yield_sfa)
YTx_se <- mean(yield_sfa)/sqrt(length(yield_sfa))

means=c(YPa_mean,YPb_mean,YPc_mean,YTx_mean,Ya_mean)
stderr=c(YPa_se,YPb_se,YPc_se,YTx_se,Ya_se)

# Plot Potential yields
bp=barplot(means, las=1, ylim=c(0,80), col=c("blue","red","green","yellow","gray"),
           names.arg=c("YPa","YPb","YPc","YTx","Ya"), ylab="Potential yield levels (t/ha)")

arrows(x0 = bp, y0 = means - stderr, x1 = bp, y1 = means + stderr, 
       angle = 90, code = 3, length = 0.1, col = "black")
title("Yield gap decomposition for Chugay")

```


```{r, echo=FALSE, results='hide'}
# Knit index.Rmd two times
file.copy(from = "./index.html", to = "../docs/", overwrite = TRUE)            
```








